DA Final Project¶

Predict Future Sales¶

Team Members: Krisha Shah, Bhavya Shah¶

In [1]:
# Data handling
import pandas as pd
import numpy as np
import datetime
from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm

# Visualisations
!pip install chart_studio
import seaborn as sns
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from scipy import stats
from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Machine learning
import xgboost as xgb
from sklearn import preprocessing
from xgboost import plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from lightgbm import LGBMRegressor
Requirement already satisfied: chart_studio in c:\users\krish\anaconda3\lib\site-packages (1.1.0)
Requirement already satisfied: plotly in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (5.9.0)
Requirement already satisfied: retrying>=1.3.3 in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (1.3.3)
Requirement already satisfied: requests in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (2.28.1)
Requirement already satisfied: six in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (1.16.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\krish\anaconda3\lib\site-packages (from plotly->chart_studio) (8.0.1)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (3.3)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (2022.9.24)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (1.26.11)
In [2]:
training_data = pd.read_csv("sales_train.csv",parse_dates = ['date'], infer_datetime_format = True, dayfirst = True)
test = pd.read_csv("test.csv")
items_data = pd.read_csv("items.csv")
shops_data = pd.read_csv("shops.csv")
item_categories = pd.read_csv("item_categories.csv")
sample_submission = pd.read_csv("sample_submission.csv")
In [3]:
# Showcase the initial 5 rows of the training_data dataset which needs to be modelled for future prediction
training_data.head()
Out[3]:
date date_block_num shop_id item_id item_price item_cnt_day
0 2013-01-02 0 59 22154 999.00 1.0
1 2013-01-03 0 25 2552 899.00 1.0
2 2013-01-05 0 25 2552 899.00 -1.0
3 2013-01-06 0 25 2554 1709.05 1.0
4 2013-01-15 0 25 2555 1099.00 1.0
In [4]:
# Showcase the initial 5 rows of the test dataset
# It contains item_id, shop_id
# Need to predict item_cnt_day
test.head()
Out[4]:
ID shop_id item_id
0 0 5 5037
1 1 5 5320
2 2 5 5233
3 3 5 5232
4 4 5 5268
In [5]:
# Showcase the initial 5 rows of the items dataset
# It contains item_name, item_id, item_category_id
items_data.head()
Out[5]:
item_name item_id item_category_id
0 ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D 0 40
1 !ABBYY FineReader 12 Professional Edition Full... 1 76
2 ***В ЛУЧАХ СЛАВЫ (UNV) D 2 40
3 ***ГОЛУБАЯ ВОЛНА (Univ) D 3 40
4 ***КОРОБКА (СТЕКЛО) D 4 40
In [6]:
# Showcase the initial 5 rows of the shops dataset
# It contains shop_name, shop_id
shops_data.head()
Out[6]:
shop_name shop_id
0 !Якутск Орджоникидзе, 56 фран 0
1 !Якутск ТЦ "Центральный" фран 1
2 Адыгея ТЦ "Мега" 2
3 Балашиха ТРК "Октябрь-Киномир" 3
4 Волжский ТЦ "Волга Молл" 4
In [7]:
# Showcase the initial 5 rows of the items_categories dataset
# It contains item_category_name and id
item_categories.head()
Out[7]:
item_category_name item_category_id
0 PC - Гарнитуры/Наушники 0
1 Аксессуары - PS2 1
2 Аксессуары - PS3 2
3 Аксессуары - PS4 3
4 Аксессуары - PSP 4
In [8]:
# Showcase the initial 5 rows of sample submission dataset
sample_submission.head()
Out[8]:
ID item_cnt_month
0 0 0.5
1 1 0.5
2 2 0.5
3 3 0.5
4 4 0.5
In [12]:
#Size of all the datasets

print("Size of Training Dataset: ", training_data.shape)
print("Size of Testing Dataset: ", test.shape)
print("Size of Items Dataset: ", items_data.shape)
print("Size of Shops Dataset: ", shops_data.shape)
print("Size of Item Categories Dataset, ", item_categories.shape)
print("Size of Sample Submssion Dataset: ", sample_submission.shape)
Size of Training Dataset:  (2935849, 6)
Size of Testing Dataset:  (214200, 3)
Size of Items Dataset:  (22170, 3)
Size of Shops Dataset:  (60, 2)
Size of Item Categories Dataset,  (84, 2)
Size of Sample Submssion Dataset:  (214200, 2)
In [13]:
# Merging all the files with common parameters as shop_id, item_id, item_category_id
df = pd.merge(training_data, shops_data, how = 'left', left_on = 'shop_id', right_on = 'shop_id')
df = pd.merge(df, items_data, how = 'left', left_on = 'item_id', right_on = 'item_id')
df = pd.merge(df, item_categories, how = 'left', left_on = 'item_category_id', right_on = 'item_category_id')
df.drop(['item_id', 'item_category_id', 'shop_id'], axis = 1, inplace = True)
df.head()
Out[13]:
date date_block_num item_price item_cnt_day shop_name item_name item_category_name
0 2013-01-02 0 999.00 1.0 Ярославль ТЦ "Альтаир" ЯВЛЕНИЕ 2012 (BD) Кино - Blu-Ray
1 2013-01-03 0 899.00 1.0 Москва ТРК "Атриум" DEEP PURPLE The House Of Blue Light LP Музыка - Винил
2 2013-01-05 0 899.00 -1.0 Москва ТРК "Атриум" DEEP PURPLE The House Of Blue Light LP Музыка - Винил
3 2013-01-06 0 1709.05 1.0 Москва ТРК "Атриум" DEEP PURPLE Who Do You Think We Are LP Музыка - Винил
4 2013-01-15 0 1099.00 1.0 Москва ТРК "Атриум" DEEP PURPLE 30 Very Best Of 2CD (Фирм.) Музыка - CD фирменного производства
In [14]:
# The information of the dataframe created by merging the datasets
# The datatypes of the attributes are shown specifically
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2935849 entries, 0 to 2935848
Data columns (total 7 columns):
 #   Column              Dtype         
---  ------              -----         
 0   date                datetime64[ns]
 1   date_block_num      int64         
 2   item_price          float64       
 3   item_cnt_day        float64       
 4   shop_name           object        
 5   item_name           object        
 6   item_category_name  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 179.2+ MB
In [15]:
# Size of merged dataset before removing duplicates
df.shape
Out[15]:
(2935849, 7)
In [16]:
# Check for duplicates
df.duplicated().sum()
Out[16]:
6
In [17]:
# Remove the duplicates
df.drop_duplicates(keep = 'first', inplace = True)
In [18]:
# Size of merged dataset after removing duplicates
df.shape
Out[18]:
(2935843, 7)
In [19]:
# Basic statistical information about the data (mean, sd, min, max)
df.describe()
Out[19]:
date_block_num item_price item_cnt_day
count 2.935843e+06 2.935843e+06 2.935843e+06
mean 1.456991e+01 8.908535e+02 1.242641e+00
std 9.422992e+00 1.729801e+03 2.618837e+00
min 0.000000e+00 -1.000000e+00 -2.200000e+01
25% 7.000000e+00 2.490000e+02 1.000000e+00
50% 1.400000e+01 3.990000e+02 1.000000e+00
75% 2.300000e+01 9.990000e+02 1.000000e+00
max 3.300000e+01 3.079800e+05 2.169000e+03
In [20]:
# Explore the distbution sales data
data = training_data[['shop_id','item_price', 'item_cnt_day']]
def num_plot(df, var):
    fig = plt.figure(figsize = (16, 5))
    plt.subplot(1,3,1)
    df[var].hist(bins = 40)
    plt.title(f"Distribution of {var}")
    
    plt.subplot(1,3,2)
    stats.probplot(df[var], dist = "norm", plot = plt)
    plt.ylabel('Quantiles')
    
    plt.subplot(1,3,3)
    sns.boxplot(y = df[var],color="red")
    plt.title("Boxplot")
    plt.show()
    
for var in data:
    num_plot(data, var)
In [21]:
# Make a plot to show the frequence of sales among shops
plt.figure(figsize = (20, 20))
sns.countplot(y=df['shop_name'], palette = ("Greens"))
plt.xlabel('Number of sold items in each shop')
plt.ylabel('Name of the shop')
Out[21]:
Text(0, 0.5, 'Name of the shop')
In [22]:
# Show shop daily sales
shop_daily_sales = training_data.groupby(['shop_id', 'date_block_num'], as_index = False)['item_cnt_day'].sum()

shop_daily_sales_sc = []
for shop in shop_daily_sales['shop_id'].unique():
    current_shop_daily_sales = shop_daily_sales[(shop_daily_sales['shop_id']) == shop]
    shop_daily_sales_sc.append(go.Scatter(x=current_shop_daily_sales['date_block_num'], y = current_shop_daily_sales['item_cnt_day'], name=('shop_id %s' % shop)))
    
layout = go.Layout(title='Shop daily sales', xaxis=dict(title='Time (2013 - 2015)'), yaxis=dict(title='Sales'))
fig = go.Figure(data = shop_daily_sales_sc, layout=layout)
iplot(fig)  
In [23]:
# Output the first 10 best selling items
best_selling_items = df['item_name'].value_counts().head(10)
best_selling_items = best_selling_items.to_frame().reset_index()
best_selling_items.columns = ['item_name', 'item_count']
best_selling_items.head(10)
Out[23]:
item_name item_count
0 Фирменный пакет майка 1С Интерес белый (34*42)... 31340
1 Playstation Store пополнение бумажника: Карта ... 9408
2 Прием денежных средств для 1С-Онлайн 9067
3 Diablo III [PC, Jewel, русская версия] 7479
4 Kaspersky Internet Security Multi-Device Russi... 6853
5 World of Warcraft. Карта оплаты игрового време... 6602
6 Grand Theft Auto V [PS3, русские субтитры] 6475
7 Call of Duty: Black Ops II [PС, Jewel, русская... 6320
8 Minecraft [Xbox 360, английская версия] 5811
9 Grand Theft Auto V [Xbox 360, русские субтитры] 5805
In [24]:
# Plotting a graph
fig = plt.figure(figsize = (20, 10))
sns.barplot(x=best_selling_items['item_count'], y=best_selling_items['item_name'], data=best_selling_items)
plt.xlabel('Item Count')
plt.ylabel('Item Name')
Out[24]:
Text(0, 0.5, 'Item Name')
In [25]:
# Output the first 10 best selling categories
best_selling_categories = df['item_category_name'].value_counts().head(10)
best_selling_categories = best_selling_categories.to_frame().reset_index()
best_selling_categories.columns = ['item_category_name', 'count']
best_selling_categories.head(10)
Out[25]:
item_category_name count
0 Кино - DVD 564651
1 Игры PC - Стандартные издания 351591
2 Музыка - CD локального производства 339585
3 Игры - PS3 208219
4 Кино - Blu-Ray 192673
5 Игры - XBOX 360 146785
6 Игры PC - Дополнительные издания 121539
7 Игры - PS4 79058
8 Подарки - Мягкие игрушки 53845
9 Подарки - Настольные игры (компактные) 53227
In [26]:
# Plotting a graph
fig = plt.figure(figsize = (20, 10))
sns.barplot(x = best_selling_categories['count'], y = best_selling_categories['item_category_name'], data = best_selling_categories)
plt.xlabel('Category Count')
plt.ylabel('Item Name')
Out[26]:
Text(0, 0.5, 'Item Name')
In [27]:
# Making a word cloud for shop name

plt.rcParams['figure.figsize'] = (5, 5)
stopwords = set(STOPWORDS)
wordcloud1 = WordCloud(background_color = 'pink',
                      max_words = 200, 
                      stopwords = stopwords,
                     width = 1000,
                     height = 1000,
                     random_state = 42).generate(str(shops_data['shop_name']))


plt.title('Wordcloud for Shop Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud1, interpolation = 'bilinear')
Out[27]:
<matplotlib.image.AxesImage at 0x23992160760>
In [28]:
# Making a word cloud for item name

plt.rcParams['figure.figsize'] = (5, 5)
stopwords = set(STOPWORDS)
wordcloud2 = WordCloud(background_color = 'lightgreen',
                      max_words = 200, 
                      stopwords = stopwords,
                     width = 1000,
                     height = 1000,
                     random_state = 42).generate(str(items_data['item_name']))


plt.title('Wordcloud for Item Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud2, interpolation = 'bilinear')
Out[28]:
<matplotlib.image.AxesImage at 0x239a35eb9a0>
In [29]:
# Making a word cloud for item categories name

plt.rcParams['figure.figsize'] = (5, 5)
stopwords = set(STOPWORDS)
wordcloud3 = WordCloud(background_color = 'lightblue',
                      max_words = 200, 
                      stopwords = stopwords,
                     width = 1000,
                     height = 1000,
                     random_state = 42).generate(str(item_categories['item_category_name']))


plt.title('Wordcloud for Item Category Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud3, interpolation = 'bilinear')
Out[29]:
<matplotlib.image.AxesImage at 0x239924c02b0>

Eliminating Outliers¶

In [30]:
# Calculate IQR
Q1 = np.percentile(df.item_price,25)/100
Q3 = np.percentile(df.item_price,75)/100
IQR = Q3-Q1

# Drop item_price 1.5 times IQR above the below the max and min item_price
# Drop item_cnt_day with standard deviation higher than 3 
training_data.drop(training_data[training_data.item_price > 1.5*IQR + training_data.item_price.max()].index, inplace = True)
training_data.drop(training_data[training_data.item_price < 1.5*IQR + training_data.item_price.min()].index, inplace = True)
training_data.drop(training_data[training_data.item_cnt_day > 3 * training_data.item_cnt_day.std()].index, inplace = True)

# There are a few negative item_counts
# Thus we need to eliminate negative values
training_data.drop(training_data[training_data.item_price < 0].index, inplace = True)
training_data.drop(training_data[training_data.item_cnt_day < 0].index, inplace = True)
In [31]:
# Calculate revenue
training_data['revenue'] = training_data.item_price * training_data.item_cnt_day
In [32]:
# Group data based on shop id and item id then get value of sales per month
pivot_training_data = training_data.pivot_table(index=['shop_id', 'item_id'], columns='date_block_num', values='item_cnt_day', aggfunc='sum').fillna(0.0)
pivot_training_data.head(20)
Out[32]:
date_block_num 0 1 2 3 4 5 6 7 8 9 ... 24 25 26 27 28 29 30 31 32 33
shop_id item_id
0 30 0.0 22.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
31 0.0 11.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
32 6.0 10.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
33 3.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
35 1.0 14.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
36 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
40 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
42 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
43 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
49 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
51 2.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
57 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
59 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
61 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
75 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
85 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
88 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
95 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
96 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
97 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

20 rows × 34 columns

In [33]:
# Size of data after removing outliers
print("Size of Training Dataset after removing outliers: ",training_data.shape)

# Size of data after creating pivot table
print("Size of Pivot Dataset: ",pivot_training_data.shape)
Size of Training Dataset after removing outliers:  (2874701, 7)
Size of Pivot Dataset:  (423533, 34)
In [34]:
# Data Cleaning
training_df = pivot_training_data.reset_index()
training_df["shop_id"] = training_df.shop_id.astype("str")
training_df["item_id"] = training_df.item_id.astype("str")
In [35]:
# Merge the items with categories
item_cat_df = items_data.merge(item_categories[['item_category_id','item_category_name']], how="inner", on="item_category_id")[['item_id','item_category_name']]
item_cat_df['item_id'] = item_cat_df.item_id.astype('str')

# Merge the train dataframe with the items and categories dataframe
training_df = training_df.merge(item_cat_df, how="inner", on="item_id")

Prediction Model 1¶

In [36]:
# Perform Label encoding
number = preprocessing.LabelEncoder()
# Check the variables in the train dataframe
training_df
Out[36]:
shop_id item_id 0 1 2 3 4 5 6 7 ... 25 26 27 28 29 30 31 32 33 item_category_name
0 0 30 0.0 22.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Кино - DVD
1 1 30 0.0 10.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Кино - DVD
2 2 30 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Кино - DVD
3 3 30 0.0 4.0 5.0 2.0 2.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 Кино - DVD
4 4 30 0.0 7.0 3.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Кино - DVD
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
423528 59 9760 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Кино - Blu-Ray
423529 59 12846 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Кино - Blu-Ray
423530 59 16270 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Программы - Для дома и офиса
423531 59 17732 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Игры PC - Дополнительные издания
423532 59 18018 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Музыка - MP3

423533 rows × 37 columns

In [37]:
# Encode the column with categorical data
training_df['item_category_name'] = number.fit_transform(training_df['item_category_name'])
In [38]:
# Rearrange the columns
training_df = training_df[['shop_id', 'item_id', 'item_category_name'] + list(range(34))]
training_df.head()
Out[38]:
shop_id item_id item_category_name 0 1 2 3 4 5 6 ... 24 25 26 27 28 29 30 31 32 33
0 0 30 40 0.0 22.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1 30 40 0.0 10.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2 30 40 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 3 30 40 0.0 4.0 5.0 2.0 2.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
4 4 30 40 0.0 7.0 3.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 37 columns

In [39]:
# Modelling
param = {'max_depth': 13, 
         'subsample':1,
         'min_child_weight': 0.7,
         'eta':0.3, 
         'num_round':1000, 
         'seed':1,
         'silent':0,
         'eval_metric':'rmse',
         'n_estimators': 120}

# Generate a DMatrix to train the model
xgbtrain = xgb.DMatrix(training_df.iloc[:,  (training_df.columns != 33)].values, training_df.iloc[:, training_df.columns == 33].values)
watchlist  = [(xgbtrain,'train-rmse')]

# Train the model
bst = xgb.train(param, xgbtrain)
preds = bst.predict(xgb.DMatrix(training_df.iloc[:,  (training_df.columns != 33)].values))
rmse = np.sqrt(mean_squared_error(preds,training_df.iloc[:, training_df.columns == 33].values))

# Print the RMSE of the model
print("RMSE Value: ",rmse)
[16:56:06] WARNING: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-03de431ba26204c4d-1/xgboost/xgboost-ci-windows/src/learner.cc:767: 
Parameters: { "n_estimators", "num_round", "silent" } are not used.

RMSE Value:  0.4302462208714635
In [40]:
# See the the most important features
fig, ax = plt.subplots(figsize=(20, 10))
xgb.plot_importance(bst, ax = ax, importance_type='gain')
Out[40]:
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
In [41]:
# Preparing the test data frame
test_df = test
test_df.shop_id = test_df.shop_id.astype('str')
test_df.item_id = test_df.item_id.astype('str')
In [42]:
test_df = test.merge(training_df, how="left", on=["shop_id", "item_id"]).fillna(0.0)
# Check the first five variables of the test dataframe
test_df.head()
Out[42]:
ID shop_id item_id item_category_name 0 1 2 3 4 5 ... 24 25 26 27 28 29 30 31 32 33
0 0 5 5037 19.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 0.0 0.0 0.0 1.0 1.0 1.0 3.0 1.0 0.0
1 1 5 5320 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2 5 5233 19.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 3.0 2.0 0.0 1.0 3.0 1.0
3 3 5 5232 23.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 4 5 5268 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 38 columns

In [43]:
d = dict(zip(test_df.columns[4:],list(np.array(list(test_df.columns[4:])) - 1)))

test_df = test_df.rename(d, axis = 1)
In [44]:
# Make sales predictions for the test data
predictions = bst.predict(xgb.DMatrix(test_df.iloc[:, (test_df.columns != 'ID') & (test_df.columns != -1)].values))
In [45]:
predictions = list(map(lambda x: min(20, max(x,0)), list(predictions)))
submission = pd.DataFrame({'ID': test_df.ID, 'item_cnt_month':predictions})
submission['item_cnt_month']  = round(submission['item_cnt_month'] , 2)
In [46]:
submission.to_csv('submission1.csv', index=False)
In [47]:
print(submission)
            ID  item_cnt_month
0            0            0.46
1            1            0.15
2            2            0.32
3            3            0.13
4            4            0.15
...        ...             ...
214195  214195            0.14
214196  214196            0.13
214197  214197            0.05
214198  214198            0.13
214199  214199            0.05

[214200 rows x 2 columns]

Prediction Model 2¶

In [48]:
# Make a dataset with only monthly sales data
data = training_data.groupby([training_data['date'].apply(lambda x: x.strftime('%Y-%m')),'item_id','shop_id']).sum().reset_index()

# Specify the important attributes which we want to add to the data
data = data[['date','item_id','shop_id','item_cnt_day']]

# Group data based on shop id and item id then get value of sales per month
data = data.pivot_table(index=['item_id','shop_id'], columns = 'date', values = 'item_cnt_day', fill_value = 0).reset_index()

data.shape
Out[48]:
(423533, 36)
In [49]:
# Merge the monthly sales data prepared to the test data set
test = pd.read_csv("test.csv")
test = pd.merge(test, data, on = ['item_id', 'shop_id'], how = 'left')

# Filling the empty values found in the dataset
test.fillna(0, inplace = True)
test.head()
Out[49]:
ID shop_id item_id 2013-01 2013-02 2013-03 2013-04 2013-05 2013-06 2013-07 ... 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07 2015-08 2015-09 2015-10
0 0 5 5037 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 0.0 0.0 0.0 1.0 1.0 1.0 3.0 1.0 0.0
1 1 5 5320 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2 5 5233 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 3.0 2.0 0.0 1.0 3.0 1.0
3 3 5 5232 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 4 5 5268 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 37 columns

In [50]:
# Create the actual training data

x_train = test.drop(['2015-10', 'item_id', 'shop_id'], axis = 1)
y_train = test['2015-10']

# Deleting the first column so that it can predict the future sales data
x_test = test.drop(['2013-01', 'item_id', 'shop_id'], axis = 1)

print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)
Shape of x_train : (214200, 34)
Shape of x_test : (214200, 34)
Shape of y_train : (214200,)
In [51]:
# Splitting the data into train and valid dataset

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

print("Shape of x_train :", x_train.shape)
print("Shape of x_valid :", x_valid.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_valid :", y_valid.shape)
Shape of x_train : (171360, 34)
Shape of x_valid : (42840, 34)
Shape of y_train : (171360,)
Shape of y_valid : (42840,)
In [52]:
# MODELING
model_lgb = LGBMRegressor( n_estimators=200,
                           learning_rate=0.03,
                           num_leaves=32,
                           colsample_bytree=0.9497036,
                           subsample=0.8715623,
                           max_depth=8,
                           reg_alpha=0.04,
                           reg_lambda=0.073,
                           min_split_gain=0.0222415,
                           min_child_weight=40)
model_lgb.fit(x_train, y_train)

y_pred_lgb = model_lgb.predict(x_test)
In [53]:
# Generate the model
y_pred_lgb = model_lgb.predict(x_test).clip(0., 20.)

# Create the submission file and submit
preds = pd.DataFrame(y_pred_lgb, columns=['item_cnt_month'])
preds.to_csv('submission2.csv',index_label='ID')
In [54]:
submission.to_csv('submission2.csv', index=False)
In [55]:
print(submission)
            ID  item_cnt_month
0            0            0.46
1            1            0.15
2            2            0.32
3            3            0.13
4            4            0.15
...        ...             ...
214195  214195            0.14
214196  214196            0.13
214197  214197            0.05
214198  214198            0.13
214199  214199            0.05

[214200 rows x 2 columns]

Prediction Model 2 has better outcome. The submission file had item counts with decimal value greater than 5 which was then rounded to 2digits after decimal. The rsme value of the first model dropped from 0.8 to 0.43 by changing the max_depth parameters appropriately.

Results:¶

Submission 1 gives a public score of 1.06217 on Kaggle competition¶

Submission 2 gives a public score of 0.87922 which leads to Rank 1629 on the leaderboard¶